- Notifications
You must be signed in to change notification settings - Fork 849
/
Copy pathOML4Py Feature Selection Using Summary Statistics.dsnb
executable file
·1 lines (1 loc) · 18.3 KB
/
OML4Py Feature Selection Using Summary Statistics.dsnb
1
[{"layout":null,"template":null,"templateConfig":null,"name":"OML4Py Feature Selection Using Summary Statistics","description":null,"readOnly":false,"type":"low","paragraphs":[{"col":0,"visualizationConfig":null,"hideInIFrame":false,"selectedVisualization":"html","title":null,"message":[],"enabled":true,"result":{"startTime":1715709329768,"interpreter":"md.low","endTime":1715709329889,"results":[],"taskStatus":"SUCCESS","forms":"[]","status":"SUCCESS"},"sizeX":0,"hideCode":true,"width":0,"hideResult":true,"dynamicFormParams":null,"row":0,"hasTitle":false,"hideVizConfig":true,"hideGutter":true,"relations":[],"forms":"[]"},{"col":0,"visualizationConfig":null,"hideInIFrame":false,"selectedVisualization":"html","title":null,"message":["%md","# OML4Py Feature Selection: Summary Statistics","","In this notebook, we demonstrate how to perform feature selection using summary statistics via OML4Py.","","We use the customer insurance lifetime value data set which contains customer financial information, lifetime value, and whether or not the customer bought insurance.","","We show how to use OML4Py to select features based on number of distinct values, null values, proportion of constant values. ","","The dataset `CUSTOMER_INSURANCE_LTV_PY` is generated by the `\"OML Run-me-first\"` notebook, which `MUST` be run before this notebook.","","---","","##### `IMPORTANT`: The `\"OML Run-me-first\"` notebook is available under the menu Templates -> Examples and is a pre-requisite to the current notebook.","","---","","Copyright (c) 2024 Oracle Corporation ","###### <a href=\"https://oss.oracle.com/licenses/upl/\" onclick=\"return ! window.open('https://oss.oracle.com/licenses/upl/');\">The Universal Permissive License (UPL), Version 1.0<\/a>","---"],"enabled":true,"result":{"startTime":1715709329963,"interpreter":"md.low","endTime":1715709330037,"results":[{"message":"<h1 id=\"oml4py-feature-selection-summary-statistics\">OML4Py Feature Selection: Summary Statistics<\/h1>\n<p>In this notebook, we demonstrate how to perform feature selection using summary statistics via OML4Py.<\/p>\n<p>We use the customer insurance lifetime value data set which contains customer financial information, lifetime value, and whether or not the customer bought insurance.<\/p>\n<p>We show how to use OML4Py to select features based on number of distinct values, null values, proportion of constant values.<\/p>\n<p>The dataset <code>CUSTOMER_INSURANCE_LTV_PY<\/code> is generated by the <code>"OML Run-me-first"<\/code> notebook, which <code>MUST<\/code> be run before this notebook.<\/p>\n<hr />\n<h5 id=\"important-the-oml-run-me-first-notebook-is-available-under-the-menu-templates---examples-and-is-a-pre-requisite-to-the-current-notebook\"><code>IMPORTANT<\/code>: The <code>"OML Run-me-first"<\/code> notebook is available under the menu Templates -> Examples and is a pre-requisite to the current notebook.<\/h5>\n<hr />\n<p>Copyright (c) 2024 Oracle Corporation<\/p>\n<h6 id=\"the-universal-permissive-license-upl-version-10\"><a href=\"https://oss.oracle.com/licenses/upl/\" onclick=\"return ! window.open('https://oss.oracle.com/licenses/upl/');\">The Universal Permissive License (UPL), Version 1.0<\/a><\/h6>\n<hr />\n","type":"HTML"}],"taskStatus":"SUCCESS","forms":"[]","status":"SUCCESS"},"sizeX":0,"hideCode":true,"width":12,"hideResult":false,"dynamicFormParams":null,"row":0,"hasTitle":false,"hideVizConfig":true,"hideGutter":true,"relations":[],"forms":"[]"},{"col":0,"visualizationConfig":null,"hideInIFrame":false,"selectedVisualization":"html","title":"For more information ...","message":["%md","","* <a href=\"https://docs.oracle.com/en/cloud/paas/autonomous-data-warehouse-cloud/index.html\" target=\"_blank\">Oracle ADB Documentation<\/a>","* <a href=\"https://github.com/oracle-samples/oracle-db-examples/tree/main/machine-learning\" target=\"_blank\">OML folder on Oracle GitHub<\/a>","* <a href=\"https://www.oracle.com/machine-learning\" target=\"_blank\">OML Web Page<\/a>","* <a href=\"https://docs.oracle.com/en/database/oracle/machine-learning/oml4py/1/mlpug/about-preparing-data.html\" target=\"_blank\">OML Data Preparation<\/a>"],"enabled":true,"result":{"startTime":1715709330123,"interpreter":"md.low","endTime":1715709330197,"results":[{"message":"<ul>\n<li><a href=\"https://docs.oracle.com/en/cloud/paas/autonomous-data-warehouse-cloud/index.html\" target=\"_blank\">Oracle ADB Documentation<\/a><\/li>\n<li><a href=\"https://github.com/oracle-samples/oracle-db-examples/tree/main/machine-learning\" target=\"_blank\">OML folder on Oracle GitHub<\/a><\/li>\n<li><a href=\"https://www.oracle.com/machine-learning\" target=\"_blank\">OML Web Page<\/a><\/li>\n<li><a href=\"https://docs.oracle.com/en/database/oracle/machine-learning/oml4py/1/mlpug/about-preparing-data.html\" target=\"_blank\">OML Data Preparation<\/a><\/li>\n<\/ul>\n","type":"HTML"}],"taskStatus":"SUCCESS","forms":"[]","status":"SUCCESS"},"sizeX":0,"hideCode":true,"width":12,"hideResult":false,"dynamicFormParams":null,"row":0,"hasTitle":true,"hideVizConfig":true,"hideGutter":true,"relations":[],"forms":"[]"},{"col":0,"visualizationConfig":null,"hideInIFrame":false,"selectedVisualization":"html","title":"Import libraries and set display options","message":["%python","","import warnings","warnings.filterwarnings('ignore')","","import pandas as pd","import oml","from oml import automl","","pd.set_option('display.max_rows', 500)","pd.set_option('display.max_columns', 500)","pd.set_option('display.width', 1000)"],"enabled":true,"result":{"startTime":1715709330274,"interpreter":"python.low","endTime":1715709333375,"results":[],"taskStatus":"SUCCESS","forms":"[]","status":"SUCCESS"},"sizeX":0,"hideCode":false,"width":12,"hideResult":false,"dynamicFormParams":null,"row":0,"hasTitle":true,"hideVizConfig":false,"hideGutter":true,"relations":[],"forms":"[]"},{"col":0,"visualizationConfig":null,"hideInIFrame":false,"selectedVisualization":"html","title":"Get proxy object for CUSTOMER_INSURANCE_LTV_PY table","message":["%python","","CUST_DF = oml.sync(table = 'CUSTOMER_INSURANCE_LTV_PY')"],"enabled":true,"result":{"startTime":1715709333448,"interpreter":"python.low","endTime":1715709333633,"results":[],"taskStatus":"SUCCESS","forms":"[]","status":"SUCCESS"},"sizeX":0,"hideCode":false,"width":12,"hideResult":false,"dynamicFormParams":null,"row":0,"hasTitle":true,"hideVizConfig":false,"hideGutter":true,"relations":[],"forms":"[]"},{"col":0,"visualizationConfig":null,"hideInIFrame":false,"selectedVisualization":"table","title":"Display a few rows from the table","message":["%python","","z.show(CUST_DF.head())"],"enabled":true,"result":{"startTime":1715709333704,"interpreter":"python.low","endTime":1715709333888,"results":[{"message":"MARITAL_STATUS\tSTATE\tCREDIT_BALANCE\tCUSTOMER_TENURE\tMORTGAGE_AMOUNT\tBANK_FUNDS\tNUM_DEPENDENTS\tHAS_CHILDREN\tINCOME\tCUSTOMER_ID\tGENDER\tPROFESSION\tCREDIT_CARD_LIMITS\tREGION\tHOME_OWNERSHIP\tNUM_ONLINE_TRANS\tBUY_INSURANCE\tMONTHLY_CHECKS\tNUM_TRANS_KIOSK\tAGE\tMONEY_MONTLY_OVERDRAWN\tLTV\tTOTAL_AUTOM_PAYMENTS\tNUM_TRANS_TELLER\tCHECKING_BALANCE\tNUM_TRANS_ATM\tLTV_BIN\tFIRST_NAME\tNUM_MORTGAGES\tCAR_OWNERSHIP\tLAST_NAME\nSINGLE\tCA \t0\t3\t400\t590\t3\t1\t61384\tCU8720 \tM \tWaiter/Waitress\t1000\tWest\t1\t400\tYes\t2\t1\t23\t53.98\t18146.0\t208\t2\t25\t5\tMEDIUM\tKENNETH\t1\t1\tNADEAU\nDIVORCED\tWA \t0\t3\t2050\t650\t3\t1\t63525\tCU8721 \tM \tSoftware Engineer\t1000\tWest\t1\t2050\tYes\t13\t3\t22\t54.35\t18581.25\t242\t2\t25\t5\tMEDIUM\tJARVIS\t1\t1\tCLARKE\nDIVORCED\tNY \t0\t4\t2500\t700\t1\t0\t71504\tCU8722 \tF \tIT Staff\t1000\tNorthEast\t1\t2500\tNo\t1\t1\t42\t53.79\t32076.0\t393\t3\t118\t4\tVERY HIGH\tSTACEY\t1\t1\tHENSON\nMARRIED\tMI \t0\t1\t8320\t11050\t1\t1\t72308\tCU8665 \tF \tDentist\t4000\tMidwest\t1\t1700\tYes\t14\t1\t33\t53.97\t24877.0\t650\t4\t25\t5\tHIGH\tMARGRET\t1\t1\tRYAN\nMARRIED\tMI \t0\t1\t1000\t0\t1\t1\t64710\tCU8666 \tM \tIT Staff\t700\tMidwest\t1\t1000\tNo\t4\t1\t38\t53.17\t23477.5\t383\t0\t374\t2\tHIGH\tJOAN\t1\t1\tBAUER\n","type":"TABLE"}],"taskStatus":"SUCCESS","forms":"[]","status":"SUCCESS"},"sizeX":0,"hideCode":false,"width":12,"hideResult":false,"dynamicFormParams":null,"row":0,"hasTitle":true,"hideVizConfig":false,"hideGutter":true,"relations":[],"forms":"[]"},{"col":0,"visualizationConfig":null,"hideInIFrame":false,"selectedVisualization":"raw","title":"Identify categorical columns","message":["%python","","cat_cols = CUST_DF.select_types([oml.core.string.String]).columns","print(cat_cols)"],"enabled":true,"result":{"startTime":1715709333968,"interpreter":"python.low","endTime":1715709334092,"results":[{"message":"['MARITAL_STATUS', 'STATE', 'CUSTOMER_ID', 'GENDER', 'PROFESSION', 'REGION', 'BUY_INSURANCE', 'LTV_BIN', 'FIRST_NAME', 'LAST_NAME']\n","type":"TEXT"}],"taskStatus":"SUCCESS","forms":"[]","status":"SUCCESS"},"sizeX":0,"hideCode":false,"width":6,"hideResult":false,"dynamicFormParams":null,"row":0,"hasTitle":true,"hideVizConfig":false,"hideGutter":true,"relations":[],"forms":"[]"},{"col":0,"visualizationConfig":null,"hideInIFrame":false,"selectedVisualization":"raw","title":"Identify numeric columns","message":["%python","","num_cols = CUST_DF.select_types([oml.core.string.Float]).columns","print(num_cols)"],"enabled":true,"result":{"startTime":1715709334166,"interpreter":"python.low","endTime":1715709334245,"results":[{"message":"['MONEY_MONTLY_OVERDRAWN', 'LTV']\n","type":"TEXT"}],"taskStatus":"SUCCESS","forms":"[]","status":"SUCCESS"},"sizeX":0,"hideCode":false,"width":6,"hideResult":false,"dynamicFormParams":null,"row":0,"hasTitle":true,"hideVizConfig":false,"hideGutter":true,"relations":[],"forms":"[]"},{"col":0,"visualizationConfig":null,"hideInIFrame":false,"selectedVisualization":"table","title":"Check the unique values in each column","message":["%python","","unique_counts = CUST_DF.nunique()","unique_df = pd.DataFrame(unique_counts)","unique_df = unique_df.reset_index()","unique_df.columns = ['FEATURE', 'CNT']","z.show(unique_df.head(10))"],"enabled":true,"result":{"startTime":1715709334325,"interpreter":"python.low","endTime":1715709334648,"results":[{"message":"FEATURE\tCNT\nMARITAL_STATUS\t7\nSTATE\t24\nCREDIT_BALANCE\t963\nCUSTOMER_TENURE\t5\nMORTGAGE_AMOUNT\t1763\nBANK_FUNDS\t1178\nNUM_DEPENDENTS\t7\nHAS_CHILDREN\t2\nINCOME\t6374\nCUSTOMER_ID\t13880\n","type":"TABLE"}],"taskStatus":"SUCCESS","forms":"[]","status":"SUCCESS"},"sizeX":0,"hideCode":false,"width":12,"hideResult":false,"dynamicFormParams":null,"row":0,"hasTitle":true,"hideVizConfig":false,"hideGutter":true,"relations":[],"forms":"[]"},{"col":0,"visualizationConfig":null,"hideInIFrame":false,"selectedVisualization":"raw","title":"Filter categorical 'identifier' columns - filter out columns with > 95% unique values","message":["%python","","selected_cat_cols = []","total = CUST_DF.shape[0]","ratio = 0.95","for col in cat_cols:"," cnt = unique_df[unique_df['FEATURE'] == col]['CNT'].values[0]"," if cnt > 1 and cnt < ratio*total:"," selected_cat_cols.append(col)","print(selected_cat_cols)","print(len(selected_cat_cols))"],"enabled":true,"result":{"startTime":1715709334729,"interpreter":"python.low","endTime":1715709334927,"results":[{"message":"['MARITAL_STATUS', 'STATE', 'CUSTOMER_ID', 'GENDER', 'PROFESSION', 'REGION', 'BUY_INSURANCE', 'LTV_BIN', 'FIRST_NAME', 'LAST_NAME']\n10\n","type":"TEXT"}],"taskStatus":"SUCCESS","forms":"[]","status":"SUCCESS"},"sizeX":0,"hideCode":false,"width":12,"hideResult":false,"dynamicFormParams":null,"row":0,"hasTitle":true,"hideVizConfig":false,"hideGutter":true,"relations":[],"forms":"[]"},{"col":0,"visualizationConfig":null,"hideInIFrame":false,"selectedVisualization":"raw","title":"To confirm selected column is not an 'identifier', use crosstab function to display unique values","message":["%python","","col = 'MARITAL_STATUS'","CUST_DF.crosstab([col])"],"enabled":true,"result":{"startTime":1715709335003,"interpreter":"python.low","endTime":1715709335325,"results":[{"message":" MARITAL_STATUS count\n0 WIDOWED 475\n1 M 359\n2 OTHER 96\n3 DIV 337\n4 DIVORCED 2063\n5 None 6627\n6 MARRIED 2393\n7 SINGLE 2530\n","type":"TEXT"}],"taskStatus":"SUCCESS","forms":"[]","status":"SUCCESS"},"sizeX":0,"hideCode":false,"width":12,"hideResult":false,"dynamicFormParams":null,"row":0,"hasTitle":true,"hideVizConfig":false,"hideGutter":true,"relations":[],"forms":"[]"},{"col":0,"visualizationConfig":null,"hideInIFrame":false,"selectedVisualization":"table","title":"Create an OML dataframe with selected features only","message":["%python","","SELECTED_CAT_DF = CUST_DF[selected_cat_cols]","z.show(SELECTED_CAT_DF.head())"],"enabled":true,"result":{"startTime":1715709335404,"interpreter":"python.low","endTime":1715709335554,"results":[{"message":"MARITAL_STATUS\tSTATE\tCUSTOMER_ID\tGENDER\tPROFESSION\tREGION\tBUY_INSURANCE\tLTV_BIN\tFIRST_NAME\tLAST_NAME\nSINGLE\tCA \tCU8720 \tM \tWaiter/Waitress\tWest\tYes\tMEDIUM\tKENNETH\tNADEAU\nDIVORCED\tWA \tCU8721 \tM \tSoftware Engineer\tWest\tYes\tMEDIUM\tJARVIS\tCLARKE\nDIVORCED\tNY \tCU8722 \tF \tIT Staff\tNorthEast\tNo\tVERY HIGH\tSTACEY\tHENSON\nMARRIED\tMI \tCU8665 \tF \tDentist\tMidwest\tYes\tHIGH\tMARGRET\tRYAN\nMARRIED\tMI \tCU8666 \tM \tIT Staff\tMidwest\tNo\tHIGH\tJOAN\tBAUER\n","type":"TABLE"}],"taskStatus":"SUCCESS","forms":"[]","status":"SUCCESS"},"sizeX":0,"hideCode":false,"width":12,"hideResult":false,"dynamicFormParams":null,"row":0,"hasTitle":true,"hideVizConfig":false,"hideGutter":true,"relations":[],"forms":"[]"},{"col":0,"visualizationConfig":null,"hideInIFrame":false,"selectedVisualization":"raw","title":"For all columns, identify columns with > 40% missing values","message":["%python","","many_missing_cols = []","ratio = 0.4","total = CUST_DF.shape[0]","threshold = total*ratio","for col in CUST_DF.columns:"," grp = CUST_DF.crosstab([col])"," if grp[ grp[col].isnull()]:"," if grp[ grp[col].isnull()].pull()['count'][0] > threshold:"," many_missing_cols.append(col)","print(many_missing_cols) "],"enabled":true,"result":{"startTime":1715709335630,"interpreter":"python.low","endTime":1715709342436,"results":[{"message":"['MARITAL_STATUS', 'CREDIT_BALANCE', 'BANK_FUNDS', 'INCOME']\n","type":"TEXT"}],"taskStatus":"SUCCESS","forms":"[]","status":"SUCCESS"},"sizeX":0,"hideCode":false,"width":12,"hideResult":false,"dynamicFormParams":null,"row":0,"hasTitle":true,"hideVizConfig":false,"hideGutter":true,"relations":[],"forms":"[]"},{"col":0,"visualizationConfig":null,"hideInIFrame":false,"selectedVisualization":"raw","title":"Filter 'constant' columns - those with > 90% same values","message":["%python","","total = CUST_DF.shape[0]","","ratio = 0.90","","major_constant_cols = []","for col in CUST_DF.columns:"," GROUP = CUST_DF.crosstab([col])"," "," if GROUP[GROUP['count'] > total*ratio]:"," print(GROUP[GROUP['count'] > total*ratio])"," major_constant_cols.append(col)","","print(major_constant_cols)"],"enabled":true,"result":{"startTime":1715709342514,"interpreter":"python.low","endTime":1715709349617,"results":[{"message":" CAR_OWNERSHIP count\n0 1 13859\n['CAR_OWNERSHIP']\n","type":"TEXT"}],"taskStatus":"SUCCESS","forms":"[]","status":"SUCCESS"},"sizeX":0,"hideCode":false,"width":12,"hideResult":false,"dynamicFormParams":null,"row":0,"hasTitle":true,"hideVizConfig":false,"hideGutter":true,"relations":[],"forms":"[]"},{"col":0,"visualizationConfig":null,"hideInIFrame":false,"selectedVisualization":"table","title":"Create an OML dataframe with 'constant' columns filtered out","message":["%python","","selected_cols = [ col for col in CUST_DF.columns if col not in major_constant_cols ]","SELECTED_DF = CUST_DF[selected_cat_cols]","z.show(SELECTED_DF.head())"],"enabled":true,"result":{"startTime":1715709349702,"interpreter":"python.low","endTime":1715709349854,"results":[{"message":"MARITAL_STATUS\tSTATE\tCUSTOMER_ID\tGENDER\tPROFESSION\tREGION\tBUY_INSURANCE\tLTV_BIN\tFIRST_NAME\tLAST_NAME\nSINGLE\tCA \tCU8720 \tM \tWaiter/Waitress\tWest\tYes\tMEDIUM\tKENNETH\tNADEAU\nDIVORCED\tWA \tCU8721 \tM \tSoftware Engineer\tWest\tYes\tMEDIUM\tJARVIS\tCLARKE\nDIVORCED\tNY \tCU8722 \tF \tIT Staff\tNorthEast\tNo\tVERY HIGH\tSTACEY\tHENSON\nMARRIED\tMI \tCU8665 \tF \tDentist\tMidwest\tYes\tHIGH\tMARGRET\tRYAN\nMARRIED\tMI \tCU8666 \tM \tIT Staff\tMidwest\tNo\tHIGH\tJOAN\tBAUER\n","type":"TABLE"}],"taskStatus":"SUCCESS","forms":"[]","status":"SUCCESS"},"sizeX":0,"hideCode":false,"width":12,"hideResult":false,"dynamicFormParams":null,"row":0,"hasTitle":true,"hideVizConfig":false,"hideGutter":true,"relations":[],"forms":"[]"},{"col":0,"visualizationConfig":null,"hideInIFrame":false,"selectedVisualization":"html","title":null,"message":["%md","","## End of Script"],"enabled":true,"result":{"startTime":1715709349938,"interpreter":"md.low","endTime":1715709349996,"results":[{"message":"<h1 id=\"end-of-script\">End of Script<\/h1>\n","type":"HTML"}],"taskStatus":"SUCCESS","forms":"[]","status":"SUCCESS"},"sizeX":0,"hideCode":true,"width":12,"hideResult":false,"dynamicFormParams":null,"row":0,"hasTitle":false,"hideVizConfig":true,"hideGutter":true,"relations":[],"forms":"[]"},{"col":0,"visualizationConfig":null,"hideInIFrame":false,"selectedVisualization":"html","title":null,"message":["%md"],"enabled":true,"result":{"startTime":1715709350083,"interpreter":"md.low","endTime":1715709350140,"results":[],"taskStatus":"SUCCESS","forms":"[]","status":"SUCCESS"},"sizeX":0,"hideCode":true,"width":12,"hideResult":true,"dynamicFormParams":null,"row":0,"hasTitle":false,"hideVizConfig":true,"hideGutter":true,"relations":[],"forms":"[]"}],"version":"6","snapshot":false,"tags":null}]